사용한 버전: pyspark version 3.1.2/ hadoop version 3.2/ mongo-spark-connector 2.12:3.0.1
설치 관련 사이트
https://ahnty0122.tistory.com/22
https://github.com/cdarlint/winutils/tree/master/hadoop-3.2.0/bin
import pyspark
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession
from pyspark.sql import SQLContext
import pandas as pd
conf = pyspark.SparkConf().set("spark.jars.packages","org.mongodb.spark:mongo-spark-connector_2.12:3.0.1").setMaster("local").setAppName("My First Spark Job").setAll([('spark.driver.memory', '40g'),('spark.executor.memory','50g')])
sc = SparkContext(conf=conf)
sqlC = SQLContext(sc)
mongo_ip = 'mongodb://localhost:27017/AI_origin_data.DDH_DAEYOU'
DDH_DAEYOU = sqlC.read.format('com.mongodb.spark.sql.DefaultSource').option('uri',mongo_ip).load()
DDH_DAEYOU.createOrReplaceTempView('DDH_DAEYOU')
DDH_DAEYOU
DataFrame[FolderIndex: string, _id: struct<oid:string>, archLowerData: struct<archType:string,toothData:array<struct<check:int,toothNumber:int,abutmentNumber:string,prepNumber:string>>>, archUpperData: struct<archType:string,toothData:array<struct<check:int,toothNumber:int,abutmentNumber:string,prepNumber:string>>>, bridgeNumber: string, center: string, crownNumber: string, guideplaneTask: string, memo: string, nasFolderPath: string, segmentationTask: string]
DDH_DAEYOU.printSchema()
root |-- FolderIndex: string (nullable = true) |-- _id: struct (nullable = true) | |-- oid: string (nullable = true) |-- archLowerData: struct (nullable = true) | |-- archType: string (nullable = true) | |-- toothData: array (nullable = true) | | |-- element: struct (containsNull = true) | | | |-- check: integer (nullable = true) | | | |-- toothNumber: integer (nullable = true) | | | |-- abutmentNumber: string (nullable = true) | | | |-- prepNumber: string (nullable = true) |-- archUpperData: struct (nullable = true) | |-- archType: string (nullable = true) | |-- toothData: array (nullable = true) | | |-- element: struct (containsNull = true) | | | |-- check: integer (nullable = true) | | | |-- toothNumber: integer (nullable = true) | | | |-- abutmentNumber: string (nullable = true) | | | |-- prepNumber: string (nullable = true) |-- bridgeNumber: string (nullable = true) |-- center: string (nullable = true) |-- crownNumber: string (nullable = true) |-- guideplaneTask: string (nullable = true) |-- memo: string (nullable = true) |-- nasFolderPath: string (nullable = true) |-- segmentationTask: string (nullable = true)
DDH_DAEYOU_df = sqlC.read.format('com.mongodb.spark.sql.DefaultSource').option('uri',mongo_ip).load().toPandas()
DDH_DAEYOU_df
| FolderIndex | _id | archLowerData | archUpperData | bridgeNumber | center | crownNumber | guideplaneTask | memo | nasFolderPath | segmentationTask | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | DDH_100 | (60f68ea5b1aa1e48ee90baf5,) | (full, [(41, 41, , ), (42, 42, , ), (43, 43, ,... | (full, [(21, 21, , ), (22, 22, , ), (23, 23, ,... | DDH | Done | Data/confident_db/DDH_100 | Done | |||
| 1 | DDH_101 | (60f68ea5b1aa1e48ee90baf6,) | (full, [(41, 41, , ), (42, 42, , ), (43, 43, ,... | (full, [(21, 21, , ), (22, 22, , ), (23, 23, ,... | DDH | Done | Data/confident_db/DDH_101 | Done | |||
| 2 | DDH_102 | (60f68ea6b1aa1e48ee90baf7,) | (full, [(41, 41, , ), (42, 42, , ), (43, 43, ,... | (full, [(21, 21, , ), (22, 22, , ), (23, 23, ,... | DDH | Done | Data/confident_db/DDH_102 | Done | |||
| 3 | DDH_103 | (60f68ea6b1aa1e48ee90baf8,) | (full, [(41, 41, , ), (42, 42, , ), (43, 43, ,... | (full, [(21, 21, , ), (22, 22, , ), (23, 23, ,... | DDH | Done | Data/confident_db/DDH_103 | Raw | |||
| 4 | DDH_104 | (60f68ea7b1aa1e48ee90baf9,) | (full, [(41, 41, , ), (42, 42, , ), (43, 43, ,... | (full, [(21, 21, , ), (22, 22, , ), (23, 23, ,... | DDH | Done | Data/confident_db/DDH_104 | Done | |||
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1038 | DAEYOU_2922 | (60f6926bb1aa1e48ee90bf03,) | (full, [(41, 41, , ), (42, 42, , ), (43, 43, ,... | (full, [(21, 21, , ), (22, 22, , ), (23, 23, ,... | DAEYOU | Done | Data/confident_db/DAEYOU_2922 | Done | |||
| 1039 | DAEYOU_2923 | (60f6926db1aa1e48ee90bf04,) | (full, [(41, 41, , ), (42, 42, , ), (43, 43, ,... | (full, [(21, 21, , ), (22, 22, , ), (23, 23, ,... | DAEYOU | Done | Data/confident_db/DAEYOU_2923 | Raw | |||
| 1040 | DAEYOU_2924 | (60f6926eb1aa1e48ee90bf05,) | (full, [(41, 41, , ), (42, 42, , ), (43, 43, ,... | (full, [(21, 21, , ), (22, 22, , ), (23, 23, ,... | DAEYOU | Done | Data/confident_db/DAEYOU_2924 | Done | |||
| 1041 | DAEYOU_2925 | (60f6926fb1aa1e48ee90bf06,) | (full, [(41, 41, , ), (42, 42, , ), (43, 43, ,... | (full, [(21, 21, , ), (22, 22, , ), (23, 23, ,... | DAEYOU | Done | Data/confident_db/DAEYOU_2925 | Done | |||
| 1042 | DAEYOU_2926 | (60f69270b1aa1e48ee90bf07,) | (full, [(41, 41, , ), (42, 42, , ), (43, 43, ,... | (full, [(21, 21, , ), (22, 22, , ), (23, 23, ,... | DAEYOU | Done | Data/confident_db/DAEYOU_2926 | Done |
1043 rows × 11 columns
DDH_DAEYOU.show()
+-----------+--------------------+--------------------+--------------------+------------+------+-----------+--------------+----+--------------------+----------------+
|FolderIndex| _id| archLowerData| archUpperData|bridgeNumber|center|crownNumber|guideplaneTask|memo| nasFolderPath|segmentationTask|
+-----------+--------------------+--------------------+--------------------+------------+------+-----------+--------------+----+--------------------+----------------+
| DDH_100|{60f68ea5b1aa1e48...|{full, [{41, 41, ...|{full, [{21, 21, ...| | DDH| | Done| |Data/confident_db...| Done|
| DDH_101|{60f68ea5b1aa1e48...|{full, [{41, 41, ...|{full, [{21, 21, ...| | DDH| | Done| |Data/confident_db...| Done|
| DDH_102|{60f68ea6b1aa1e48...|{full, [{41, 41, ...|{full, [{21, 21, ...| | DDH| | Done| |Data/confident_db...| Done|
| DDH_103|{60f68ea6b1aa1e48...|{full, [{41, 41, ...|{full, [{21, 21, ...| | DDH| | Done| |Data/confident_db...| Raw|
| DDH_104|{60f68ea7b1aa1e48...|{full, [{41, 41, ...|{full, [{21, 21, ...| | DDH| | Done| |Data/confident_db...| Done|
| DDH_105|{60f68ea7b1aa1e48...|{full, [{41, 41, ...|{full, [{21, 21, ...| | DDH| | Done| |Data/confident_db...| Done|
| DDH_106|{60f68ea7b1aa1e48...|{full, [{41, 41, ...|{full, [{21, 21, ...| | DDH| | Done| |Data/confident_db...| Done|
| DDH_107|{60f68ea8b1aa1e48...|{full, [{41, 41, ...|{full, [{21, 21, ...| | DDH| | Done| |Data/confident_db...| Done|
| DDH_108|{60f68ea8b1aa1e48...|{full, [{41, 41, ...|{full, [{21, 21, ...| | DDH| | Done| |Data/confident_db...| Done|
| DDH_109|{60f68ea9b1aa1e48...|{full, [{41, 41, ...|{full, [{21, 21, ...| | DDH| | Done| |Data/confident_db...| Done|
| DDH_110|{60f68ea9b1aa1e48...|{full, [{41, 41, ...|{full, [{21, 21, ...| | DDH| | Done| |Data/confident_db...| Done|
| DDH_111|{60f68ea9b1aa1e48...|{full, [{41, 41, ...|{full, [{21, 21, ...| | DDH| | Done| |Data/confident_db...| Done|
| DDH_112|{60f68eaab1aa1e48...|{full, [{41, 41, ...|{full, [{21, 21, ...| | DDH| | Done| |Data/confident_db...| Done|
| DDH_113|{60f68eabb1aa1e48...|{full, [{41, 41, ...|{full, [{21, 21, ...| | DDH| | Done| |Data/confident_db...| Done|
| DDH_114|{60f68eabb1aa1e48...|{full, [{41, 41, ...|{full, [{21, 21, ...| | DDH| | Done| |Data/confident_db...| Done|
| DDH_115|{60f68eabb1aa1e48...|{full, [{41, 41, ...|{full, [{21, 21, ...| | DDH| | Done| |Data/confident_db...| Done|
| DDH_116|{60f68eacb1aa1e48...|{full, [{41, 41, ...|{full, [{21, 21, ...| | DDH| | Done| |Data/confident_db...| Done|
| DDH_117|{60f68eacb1aa1e48...|{full, [{41, 41, ...|{full, [{21, 21, ...| | DDH| | Done| |Data/confident_db...| Done|
| DDH_118|{60f68eacb1aa1e48...|{full, [{41, 41, ...|{full, [{21, 21, ...| | DDH| | Done| |Data/confident_db...| Done|
| DDH_119|{60f68eadb1aa1e48...|{full, [{41, 41, ...|{full, [{21, 21, ...| | DDH| | Done| |Data/confident_db...| Done|
+-----------+--------------------+--------------------+--------------------+------------+------+-----------+--------------+----+--------------------+----------------+
only showing top 20 rows
DDH_DAEYOU.select('archLowerData.toothData.toothNumber', 'FolderIndex','center','nasFolderPath').show()
+--------------------+-----------+------+--------------------+ | toothNumber|FolderIndex|center| nasFolderPath| +--------------------+-----------+------+--------------------+ |[41, 42, 43, 44, ...| DDH_100| DDH|Data/confident_db...| |[41, 42, 43, 44, ...| DDH_101| DDH|Data/confident_db...| |[41, 42, 43, 44, ...| DDH_102| DDH|Data/confident_db...| |[41, 42, 43, 44, ...| DDH_103| DDH|Data/confident_db...| |[41, 42, 43, 44, ...| DDH_104| DDH|Data/confident_db...| |[41, 42, 43, 45, ...| DDH_105| DDH|Data/confident_db...| |[41, 42, 43, 45, ...| DDH_106| DDH|Data/confident_db...| |[41, 42, 43, 45, ...| DDH_107| DDH|Data/confident_db...| |[41, 42, 43, 44, ...| DDH_108| DDH|Data/confident_db...| |[41, 42, 43, 44, ...| DDH_109| DDH|Data/confident_db...| |[41, 42, 43, 44, ...| DDH_110| DDH|Data/confident_db...| |[41, 42, 43, 44, ...| DDH_111| DDH|Data/confident_db...| |[41, 42, 43, 44, ...| DDH_112| DDH|Data/confident_db...| |[41, 42, 43, 45, ...| DDH_113| DDH|Data/confident_db...| |[41, 42, 43, 44, ...| DDH_114| DDH|Data/confident_db...| |[41, 42, 43, 44, ...| DDH_115| DDH|Data/confident_db...| |[41, 42, 43, 45, ...| DDH_116| DDH|Data/confident_db...| |[41, 42, 43, 44, ...| DDH_117| DDH|Data/confident_db...| |[41, 42, 43, 44, ...| DDH_118| DDH|Data/confident_db...| |[41, 42, 43, 44, ...| DDH_119| DDH|Data/confident_db...| +--------------------+-----------+------+--------------------+ only showing top 20 rows
sqlC.sql("SELECT nasFolderPath FROM DDH_DAEYOU WHERE array_contains(archLowerData.toothData.toothNumber, 46)").toPandas()
| nasFolderPath | |
|---|---|
| 0 | Data/confident_db/DDH_100 |
| 1 | Data/confident_db/DDH_101 |
| 2 | Data/confident_db/DDH_102 |
| 3 | Data/confident_db/DDH_103 |
| 4 | Data/confident_db/DDH_104 |
| ... | ... |
| 1012 | Data/confident_db/DAEYOU_2922 |
| 1013 | Data/confident_db/DAEYOU_2923 |
| 1014 | Data/confident_db/DAEYOU_2924 |
| 1015 | Data/confident_db/DAEYOU_2925 |
| 1016 | Data/confident_db/DAEYOU_2926 |
1017 rows × 1 columns
DDH_DAEYOU.select('FolderIndex','nasFolderPath').filter(DDH_DAEYOU.center=='DDH').show()
+-----------+--------------------+ |FolderIndex| nasFolderPath| +-----------+--------------------+ | DDH_100|Data/confident_db...| | DDH_101|Data/confident_db...| | DDH_102|Data/confident_db...| | DDH_103|Data/confident_db...| | DDH_104|Data/confident_db...| | DDH_105|Data/confident_db...| | DDH_106|Data/confident_db...| | DDH_107|Data/confident_db...| | DDH_108|Data/confident_db...| | DDH_109|Data/confident_db...| | DDH_110|Data/confident_db...| | DDH_111|Data/confident_db...| | DDH_112|Data/confident_db...| | DDH_113|Data/confident_db...| | DDH_114|Data/confident_db...| | DDH_115|Data/confident_db...| | DDH_116|Data/confident_db...| | DDH_117|Data/confident_db...| | DDH_118|Data/confident_db...| | DDH_119|Data/confident_db...| +-----------+--------------------+ only showing top 20 rows
sqlC.sql("SELECT FolderIndex, nasFolderPath FROM DDH_DAEYOU WHERE center = 'DDH'").toPandas()
| FolderIndex | nasFolderPath | |
|---|---|---|
| 0 | DDH_100 | Data/confident_db/DDH_100 |
| 1 | DDH_101 | Data/confident_db/DDH_101 |
| 2 | DDH_102 | Data/confident_db/DDH_102 |
| 3 | DDH_103 | Data/confident_db/DDH_103 |
| 4 | DDH_104 | Data/confident_db/DDH_104 |
| ... | ... | ... |
| 158 | DDH_5 | Data/confident_db/DDH_5 |
| 159 | DDH_6 | Data/confident_db/DDH_6 |
| 160 | DDH_7 | Data/confident_db/DDH_7 |
| 161 | DDH_8 | Data/confident_db/DDH_8 |
| 162 | DDH_9 | Data/confident_db/DDH_9 |
163 rows × 2 columns
DDH_DAEYOU.groupBy('center').count().show()
+------+-----+ |center|count| +------+-----+ |DAEYOU| 880| | DDH| 163| +------+-----+
sqlC.sql('SELECT center, COUNT(*) AS count FROM DDH_DAEYOU GROUP BY center').toPandas()
| center | count | |
|---|---|---|
| 0 | DAEYOU | 880 |
| 1 | DDH | 163 |
label_ip = 'mongodb+srv://CloudAdmin:Imagoworks.ai@cluster0.3f0kt.azure.mongodb.net/3DmeLabel_Dev.test?authSource=admin&replicaSet=atlas-5dpfiu-shard-0&w=majority&readPreference=primary&appname=MongoDB%20Compass&retryWrites=true&ssl=true'
label_df = sqlC.read.format('com.mongodb.spark.sql.DefaultSource').option('uri',label_ip).load()
label_df.createOrReplaceTempView('label_df')
label_df.toPandas()
| FolderIndex | _id | archLowerData | archUpperData | bridgeNumber | center | crownNumber | guideplaneTask | memo | nasFolderPath | segmentationTask | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | DDH_100 | (60ffab8924e75e4b9ea14f47,) | (full, [(, 41, , 41), (, 42, , 42), (, 43, , 4... | (full, [(, 21, , 21), (, 22, , 22), (, 23, , 2... | DDH | Done | Data/confident_db/DDH_100 | Done | |||
| 1 | DDH_101 | (60ffab8924e75e4b9ea14f48,) | (full, [(, 41, , 41), (, 42, , 42), (, 43, , 4... | (full, [(, 21, , 21), (, 22, , 22), (, 23, , 2... | DDH | Done | Data/confident_db/DDH_101 | Done | |||
| 2 | DDH_102 | (60ffab8924e75e4b9ea14f49,) | (full, [(, 41, , 41), (, 42, , 42), (, 43, , 4... | (full, [(, 21, , 21), (, 22, , 22), (, 23, , 2... | DDH | Done | Data/confident_db/DDH_102 | Done | |||
| 3 | DDH_103 | (60ffab8924e75e4b9ea14f4a,) | (full, [(, 41, , 41), (, 42, , 42), (, 43, , 4... | (full, [(, 21, , 21), (, 22, , 22), (, 23, , 2... | DDH | Done | Data/confident_db/DDH_103 | Raw | |||
| 4 | DDH_104 | (60ffab8924e75e4b9ea14f4b,) | (full, [(, 41, , 41), (, 42, , 42), (, 43, , 4... | (full, [(, 21, , 21), (, 22, , 22), (, 23, , 2... | DDH | Done | Data/confident_db/DDH_104 | Done | |||
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1038 | DAEYOU_2922 | (60ffab8924e75e4b9ea15355,) | (full, [(, 41, , 41), (, 42, , 42), (, 43, , 4... | (full, [(, 21, , 21), (, 22, , 22), (, 23, , 2... | DAEYOU | Done | Data/confident_db/DAEYOU_2922 | Done | |||
| 1039 | DAEYOU_2923 | (60ffab8924e75e4b9ea15356,) | (full, [(, 41, , 41), (, 42, , 42), (, 43, , 4... | (full, [(, 21, , 21), (, 22, , 22), (, 23, , 2... | DAEYOU | Done | Data/confident_db/DAEYOU_2923 | Raw | |||
| 1040 | DAEYOU_2924 | (60ffab8924e75e4b9ea15357,) | (full, [(, 41, , 41), (, 42, , 42), (, 43, , 4... | (full, [(, 21, , 21), (, 22, , 22), (, 23, , 2... | DAEYOU | Done | Data/confident_db/DAEYOU_2924 | Done | |||
| 1041 | DAEYOU_2925 | (60ffab8924e75e4b9ea15358,) | (full, [(, 41, , 41), (, 42, , 42), (, 43, , 4... | (full, [(, 21, , 21), (, 22, , 22), (, 23, , 2... | DAEYOU | Done | Data/confident_db/DAEYOU_2925 | Done | |||
| 1042 | DAEYOU_2926 | (60ffab8924e75e4b9ea15359,) | (full, [(, 41, , 41), (, 42, , 42), (, 43, , 4... | (full, [(, 21, , 21), (, 22, , 22), (, 23, , 2... | DAEYOU | Done | Data/confident_db/DAEYOU_2926 | Done |
1043 rows × 11 columns
sqlC.sql("SELECT FolderIndex, nasFolderPath, archLowerData.toothData.toothNumber, archUpperData.toothData.toothNumber FROM label_df ").toPandas()
| FolderIndex | nasFolderPath | toothNumber | toothNumber | |
|---|---|---|---|---|
| 0 | DDH_100 | Data/confident_db/DDH_100 | [41, 42, 43, 44, 45, 46, 47, 31, 32, 33, 34, 3... | [21, 22, 23, 24, 25, 26, 27, 11, 12, 13, 14, 1... |
| 1 | DDH_101 | Data/confident_db/DDH_101 | [41, 42, 43, 44, 45, 46, 47, 31, 32, 33, 34, 3... | [21, 22, 23, 24, 25, 26, 27, 11, 12, 13, 14, 1... |
| 2 | DDH_102 | Data/confident_db/DDH_102 | [41, 42, 43, 44, 45, 46, 47, 31, 32, 33, 34, 3... | [21, 22, 23, 24, 25, 26, 27, 11, 12, 13, 14, 1... |
| 3 | DDH_103 | Data/confident_db/DDH_103 | [41, 42, 43, 44, 45, 46, 47, 31, 32, 33, 34, 3... | [21, 22, 23, 24, 25, 26, 27, 11, 12, 13, 14, 1... |
| 4 | DDH_104 | Data/confident_db/DDH_104 | [41, 42, 43, 44, 45, 46, 47, 31, 32, 33, 34, 3... | [21, 22, 23, 24, 25, 26, 27, 11, 12, 13, 14, 1... |
| ... | ... | ... | ... | ... |
| 1038 | DAEYOU_2922 | Data/confident_db/DAEYOU_2922 | [41, 42, 43, 44, 45, 46, 47, 31, 32, 33, 34, 3... | [21, 22, 23, 24, 25, 26, 27, 11, 12, 13, 14, 1... |
| 1039 | DAEYOU_2923 | Data/confident_db/DAEYOU_2923 | [41, 42, 43, 44, 45, 46, 31, 32, 33, 34, 35, 36] | [21, 22, 23, 24, 25, 26, 11, 12, 13, 14, 15, 16] |
| 1040 | DAEYOU_2924 | Data/confident_db/DAEYOU_2924 | [41, 42, 43, 44, 45, 46, 47, 31, 32, 33, 34, 3... | [21, 22, 23, 24, 25, 26, 27, 11, 12, 13, 14, 1... |
| 1041 | DAEYOU_2925 | Data/confident_db/DAEYOU_2925 | [41, 42, 43, 44, 45, 46, 47, 31, 32, 33, 34, 3... | [21, 22, 23, 24, 25, 26, 27, 11, 12, 13, 14, 1... |
| 1042 | DAEYOU_2926 | Data/confident_db/DAEYOU_2926 | [41, 42, 43, 44, 45, 46, 47, 48, 31, 32, 33, 3... | [21, 22, 23, 24, 25, 26, 27, 28, 11, 12, 13, 1... |
1043 rows × 4 columns